Computational research group in Granada university and GENyO research Centre. Expertise and interests in:
# Load data
train <- pseq(inputdir = inputdir, subset = "train")
test <- pseq(inputdir = inputdir, subset = "scoring")
# Agglomerate by Species
train <- tax_glom(train, taxrank = "Species")
train <- subset_taxa(train, Species != "s__")
test <- tax_glom(test, taxrank = "Species")
test <- subset_taxa(test, Species != "s__")
print(train)
phyloseq-class experiment-level object
otu_table() OTU Table: [ 4868 taxa and 3615 samples ]
sample_data() Sample Data: [ 3615 samples by 12 sample variables ]
tax_table() Taxonomy Table: [ 4868 taxa by 7 taxonomic ranks ]
print(test)
phyloseq-class experiment-level object
otu_table() OTU Table: [ 4868 taxa and 1809 samples ]
sample_data() Sample Data: [ 1809 samples by 10 sample variables ]
tax_table() Taxonomy Table: [ 4868 taxa by 7 taxonomic ranks ]
# Relative abundance
train_relab <- transform_sample_counts(train, function(x) x / sum(x) )
test_relab <- transform_sample_counts(test, function(x) x / sum(x) )
# Calculate disbiosis scores
train_filt <- subset_taxa(train_relab, Phylum %in% c("p__Firmicutes","p__Bacteroidetes"))
test_filt <- subset_taxa(test_relab, Phylum %in% c("p__Firmicutes","p__Bacteroidetes"))
calculate_f_b_ratio <- function(data){
f_tax <- subset_taxa(data, Phylum == "p__Firmicutes")
b_tax <- subset_taxa(data, Phylum == "p__Bacteroidetes")
f_abundance <- sample_sums(f_tax)
b_abundance <- sample_sums(b_tax)
return(f_abundance / b_abundance)
}
disbiosis_train <- calculate_f_b_ratio(train_filt)
disbiosis_test <- calculate_f_b_ratio(test_filt)
sample_data(train)$disbiosis <- disbiosis_train
sample_data(test)$disbiosis <- disbiosis_test
print(head(disbiosis_train))
Simulated_328 Simulated_1644 Simulated_1710 Simulated_1732 Simulated_1727 Simulated_2196
1.1275385 0.0509096 0.4192907 0.4470816 0.8496063 1.4721705
# Relative Abundance of phylos
train_ph <- tax_glom(train_relab, taxrank = "Phylum")
test_ph <- tax_glom(test_relab, taxrank = "Phylum")
train_ph <- subset_taxa(train_ph, Domain %in% c("k__Archaea", "k__Bacteria"))
test_ph <- subset_taxa(test_ph, Domain %in% c("k__Archaea", "k__Bacteria"))
train_phylos <- t(train_ph@otu_table@.Data)
colnames(train_phylos) <- tax_table(train_ph)[,2]
test_phylos <- t(test_ph@otu_table@.Data)
colnames(test_phylos) <- tax_table(test_ph)[,2]
dim(train_phylos)
[1] 3615 36
print(colnames(train_phylos))
Taxonomy Table: [36 taxa by 1 taxonomic ranks]:
Phylum
taxid_9 "p__Crenarchaeota"
taxid_148 "p__Euryarchaeota"
taxid_261 "p__Thaumarchaeota"
taxid_295 "p__Acidobacteria"
taxid_355 "p__Actinobacteria"
taxid_1237 "p__Aquificae"
taxid_1256 "p__Armatimonadetes"
taxid_1281 "p__Bacteroidetes"
taxid_1732 "p__Caldiserica"
taxid_1733 "p__Calditrichaeota"
taxid_1746 "p__Chlamydiae"
taxid_1760 "p__Chlorobi"
taxid_1787 "p__Chloroflexi"
taxid_1798 "p__Chrysiogenetes"
taxid_1802 "p__Cyanobacteria"
taxid_1809 "p__Deferribacteres"
taxid_1823 "p__Deinococcus-Thermus"
taxid_1856 "p__Dictyoglomi"
taxid_1858 "p__Elusimicrobia"
taxid_1861 "p__Fibrobacteres"
taxid_2764 "p__Firmicutes"
taxid_2951 "p__Fusobacteria"
taxid_2971 "p__Gemmatimonadetes"
taxid_2974 "p__Ignavibacteriae"
taxid_2976 "p__Kiritimatiellaeota"
taxid_2977 "p__Lentisphaerae"
taxid_2978 "p__Nitrospinae"
taxid_2985 "p__Nitrospirae"
taxid_2996 "p__Planctomycetes"
taxid_4500 "p__Proteobacteria"
taxid_5098 "p__Spirochaetes"
taxid_5109 "p__Synergistetes"
taxid_5179 "p__Tenericutes"
taxid_5250 "p__Thermodesulfobacteria"
taxid_5272 "p__Thermotogae"
taxid_5295 "p__Verrucomicrobia"
# Calculate richness
richness_train <- estimate_richness(
train,
split = TRUE,
measures = c("Shannon", "Observed",
"Chao1", "Simpson",
"InvSimpson", "Fisher"))
richness_test <- estimate_richness(
test,
split = TRUE,
measures = c("Shannon", "Observed",
"Chao1", "Simpson",
"InvSimpson", "Fisher"))
# Filter taxa by counts
train <- filter_taxa(train,
function(x) sum(x > 2) > (0.8 * length(x)), TRUE)
# Calculate co-abundances
coab_taxa <- co_abundances(train)
DT::datatable(coab_taxa)
# Caculate GSVA
scores <- get_scores(coab_taxa, train, test, method = "gsva")
Estimating GSVA scores for 24 gene sets.
Estimating ECDFs with Gaussian kernels
|
| | 0%
|
|=== | 4%
|
|======= | 8%
|
|========== | 12%
|
|============= | 17%
|
|================= | 21%
|
|==================== | 25%
|
|======================= | 29%
|
|=========================== | 33%
|
|============================== | 38%
|
|================================= | 42%
|
|===================================== | 46%
|
|======================================== | 50%
|
|=========================================== | 54%
|
|=============================================== | 58%
|
|================================================== | 62%
|
|===================================================== | 67%
|
|========================================================= | 71%
|
|============================================================ | 75%
|
|=============================================================== | 79%
|
|=================================================================== | 83%
|
|====================================================================== | 88%
|
|========================================================================= | 92%
|
|============================================================================= | 96%
|
|================================================================================| 100%
Estimating GSVA scores for 24 gene sets.
Estimating ECDFs with Gaussian kernels
|
| | 0%
|
|=== | 4%
|
|======= | 8%
|
|========== | 12%
|
|============= | 17%
|
|================= | 21%
|
|==================== | 25%
|
|======================= | 29%
|
|=========================== | 33%
|
|============================== | 38%
|
|================================= | 42%
|
|===================================== | 46%
|
|======================================== | 50%
|
|=========================================== | 54%
|
|=============================================== | 58%
|
|================================================== | 62%
|
|===================================================== | 67%
|
|========================================================= | 71%
|
|============================================================ | 75%
|
|=============================================================== | 79%
|
|=================================================================== | 83%
|
|====================================================================== | 88%
|
|========================================================================= | 92%
|
|============================================================================= | 96%
|
|================================================================================| 100%
# Create train and test data
pheno_train <- sample_data(train)
x_train <- data.frame(pheno_train[, -c(8, 9)],
richness_train,
train_phylos,
scores$train)
y_train <- pheno_train[, c("Event_time", "Event")]
pheno_test <- sample_data(test)
x_test <- data.frame(pheno_test,
richness_test,
test_phylos,
scores$test)
PrevalentHFAIL variable# Check data
# ======
stopifnot(ncol(x_train) == ncol(x_test))
stopifnot(colnames(x_train) == colnames(x_test))
stopifnot(nrow(x_train) == nrow(y_train))
# Creating train and test data
train <- cbind.data.frame(x_train, y_train)
test <- as.data.frame(x_test)
# What do we do with ...
# ======
# NA's values in train and test?
train <- train[complete.cases(train), ]
test <- missRanger(test, pmm.k = 10, seed = 153)
Missing value imputation by random forests
Variables to impute: Smoking, PrevalentDiabetes, PrevalentCHD, PrevalentHFAIL, SystolicBP, NonHDLcholesterol
Variables used to impute: Age, BodyMassIndex, Smoking, BPTreatment, PrevalentDiabetes, PrevalentCHD, PrevalentHFAIL, SystolicBP, NonHDLcholesterol, Sex, disbiosis, Observed, Chao1, se.chao1, Shannon, Simpson, InvSimpson, Fisher, p__Crenarchaeota, p__Euryarchaeota, p__Thaumarchaeota, p__Acidobacteria, p__Actinobacteria, p__Aquificae, p__Armatimonadetes, p__Bacteroidetes, p__Caldiserica, p__Calditrichaeota, p__Chlamydiae, p__Chlorobi, p__Chloroflexi, p__Chrysiogenetes, p__Cyanobacteria, p__Deferribacteres, p__Deinococcus.Thermus, p__Dictyoglomi, p__Elusimicrobia, p__Fibrobacteres, p__Firmicutes, p__Fusobacteria, p__Gemmatimonadetes, p__Ignavibacteriae, p__Kiritimatiellaeota, p__Lentisphaerae, p__Nitrospinae, p__Nitrospirae, p__Planctomycetes, p__Proteobacteria, p__Spirochaetes, p__Synergistetes, p__Tenericutes, p__Thermodesulfobacteria, p__Thermotogae, p__Verrucomicrobia, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5, cluster_6, cluster_7, cluster_8, cluster_9, cluster_10, cluster_11, cluster_12, cluster_13, cluster_14, cluster_15, cluster_16, cluster_17, cluster_18, cluster_19, cluster_20, cluster_21, cluster_22, cluster_23, cluster_24
iter 1: ......
iter 2: ......
iter 3: ......
iter 4: ......
# Negatives survival values in train and test?
train <- train[-which(train$Event_time < 0), ]
train$Event_time <- train$Event_time + 0.1
#test$Event_time[which(test$Event_time < 0)] <- 15
# Removing PrevalentHFAIL (only 0)
train <- train[, -grep("PrevalentHFAIL", colnames(train))]
test <- test[, -grep("PrevalentHFAIL", colnames(test))]
DT::datatable(train)
Warning in instance$preRenderHook(instance): It seems your data is too big for client-
side DataTables. You may consider server-side processing: https://rstudio.github.io/DT/
server.html
lasso as feature selection embedded methodAge, BodyMassIndex, SystolicBP, NonHDLcholesterol, Sex, disbiosis as covariates# Fit model
# ========
method = "lasso"
cvrts <- c("Age", "BodyMassIndex",
"SystolicBP", "NonHDLcholesterol",
"Sex", "disbiosis")
models <- fit_biospear(data = train,
biomarkers = setdiff(colnames(train),
c(cvrts, colnames(y_train))),
surv = c("Event_time", "Event"),
cvrts = cvrts,
inter = FALSE,
methods = method)
Computing selection with method: lasso
print(models)
$lasso
Age BodyMassIndex SystolicBP NonHDLcholesterol
0.044753515 0.011847561 0.002076959 -0.094485660
Sex disbiosis PrevalentCHD InvSimpson
0.032552521 0.005092198 0.050703722 -0.019780689
p__Armatimonadetes p__Chlorobi p__Kiritimatiellaeota p__Nitrospinae
-0.032408719 -0.028157328 0.008537367 0.075069928
cluster_5
0.097205879
# Build cox with all train data
# ====
selected_betas <- models$lasso
train <- train[, match(
c("Event", "Event_time", names(selected_betas)),
colnames(train))]
surv <- "Surv(Event_time, Event) ~"
f <- as.formula(paste0(surv, paste(names(selected_betas),collapse='+')))
model <- coxph(f, train)
# Risk Prediction
# ====
risk = function(model, newdata, time) {
as.numeric(1-summary(
survfit(model, newdata = newdata, se.fit = F, conf.int = F),
times = time, extend = TRUE)$surv)
}
pred <- risk(model, test, time = 15)
# Save scores
scores <- data.frame(
SampleID = rownames(test),
Score = pred
)
print(head(scores))
SampleID Score
1 Simulated_2211 0.03586051
2 Simulated_1629 0.02333127
3 Simulated_1690 0.01572205
4 Simulated_1367 0.03958909
5 Simulated_3387 0.02845292
6 Simulated_1746 0.04254048
#write.csv(scores, quote = FALSE, row.names = FALSE,
# file = file.path(outputdir, "scores.csv"))
end <- Sys.time()
time <- difftime(end, start, units = "mins")
print(time)
Time difference of 12.9126 mins